# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation, peak_analysis, peak_ranges
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load the LDA topic assignment per hashtag and keep the list of relevant hashtags
# (this list is attached to every similarity row further below).
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = topics_df['hashtag'].tolist()
# Load the hashtag time series; its 'count' column is renamed to 'hashtag_count' later.
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
hashtag_df.head(3)
# Load politician metadata and keep only name, party and gender.
persons_df = (
    pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
    .drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'])
    .rename(columns={'Name':'queryterm', 'Party':'party', 'Gender':'gender'})
)
# Lower-case the query terms so they join cleanly with the suggestion data.
persons_df['queryterm'] = persons_df['queryterm'].apply(lambda x: x.lower())
persons_df.head(3)
# Load the manually labelled category for each suggestion cluster.
cluster_cat = pd.read_csv('../../data/BTW17_Suggestions/suggestions/cluster_categories.csv', delimiter=',')
cluster_cat.drop(columns='Unnamed: 0', inplace=True)
# Cluster size = number of entries in the comma-separated suggestion string.
# NOTE(review): assumes 'sugg' joins suggestions with ', ' exactly and that no
# suggestion itself contains ', ' — verify against the file format.
cluster_cat['size'] = cluster_cat['sugg'].apply(lambda x: x.count(', ')+1)
cluster_cat.head(3)
# Load the raw suggestions time series and reduce timestamps to calendar dates.
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
# Count occurrences per (date, queryterm, suggestion).
# NOTE(review): assigning groupby().count() into four named columns relies on the
# grouped result having exactly one non-key column — confirm the parquet schema.
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
# Attach party and gender metadata for each politician query term.
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
# Load vector similarities between suggestions and hashtags.
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
# Every row gets the full hashtag list (exploded into one row per hashtag later).
# NOTE(review): all rows share the same list object — fine as long as it is never
# mutated in place.
similarity_df['hashtags'] = [hashtags for i in similarity_df.index]
# Suggestions are stored as token lists; join them back into a single string for merging.
similarity_df['suggestion'] = similarity_df['suggestion'].apply(lambda x: ' '.join(x))
# Attach the cluster id to each suggestion row and aggregate counts per cluster.
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
# NOTE(review): sum('count') passes 'count' positionally into GroupBy.sum's
# numeric_only parameter (truthy → sums all numeric columns); it is NOT a column
# selector. Consider sum(numeric_only=True) for clarity.
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False).sum('count')
suggestions_df.head(3)
# Remodel: explode to one row per (suggestion, cluster, hashtag) with its score.
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])
# Mean similarity per cluster/hashtag pair across all suggestions in the cluster.
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False).mean('similarity_scores')
similarity_df = similarity_df.merge(cluster_cat, how='left', on='cluster')
# Filter out the noise category ('Rauschen').
similarity_df = similarity_df[similarity_df['category']!='Rauschen'].reset_index(drop=True)
similarity_df.head(3)
# --- data preparation for the time-lagged cross correlation (TLCC) ---
# Keep only cluster/hashtag combinations with a similarity score of at least 0.5.
sim_df = similarity_df[similarity_df['similarity_scores']>=0.5].reset_index(drop=True)

def _cluster_counts(extra_keys):
    """Sum suggestion counts per date, the given extra keys and cluster."""
    keys = ['date'] + extra_keys + ['cluster']
    grouped = suggestions_df.groupby(keys, as_index=False).sum('count')
    return grouped.rename(columns={'count':'cluster_count'})

# Cluster counts overall, per party and per gender.
cluster_df = _cluster_counts([])
cluster_party_df = _cluster_counts(['party'])
cluster_gender_df = _cluster_counts(['gender'])
# Align the hashtag column name with the cluster naming scheme.
hashtag_df.rename(columns={'count':'hashtag_count'}, inplace=True)
# Extended colour palette (two palette repetitions) for plots with many traces.
# Copy first: px.colors.qualitative.Antique is a module-level list, and
# extending it in place would also double the palette for every later
# px.colors.qualitative.Antique lookup (e.g. the scatter/line plots below).
colors = list(px.colors.qualitative.Antique)
colors.extend(px.colors.qualitative.Antique)
# Time lags to test: 0..70 days in weekly (7-day) steps.
delays = list(range(0, 71, 7))
# One-off cache generation: uncomment to recompute the TLCC correlation tables
# and write them to disk; afterwards they are re-loaded from JSON below.
#dfs = []
#for i in delays:
#    dfs.append(get_correlation(i, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df))
#for i in range(len(dfs)):
#    dfs[i].to_json(f'../../data/Analysis/df_{delays[i]}_delays.json')
# set to *.json to load all
input_loc = '../../data/Analysis/*delays.json'
# Sort numerically by the delay embedded in the file name (df_<delay>_delays.json):
# glob returns files in arbitrary filesystem order, but the cells below pair
# dfs[i] with delays[i] and therefore require ascending delay order.
input_files = sorted(glob.glob(input_loc), key=lambda f: int(f.split('_')[-2]))
dfs = []
for file in input_files:
    data = pd.read_json(file)
    data = data.merge(cluster_cat, how='left', on='cluster')
    #data = data[(data['pearsonr']>=0)&(data['p_value']<=0.05)&(data['gender']=='all')&(data['party']=='all')]
    # Keep only non-negative correlations; significance is assessed later via
    # combined p-values.
    data = data[(data['pearsonr']>=0)]
    dfs.append(data)
hashtag_df
# Headline statistics of the relevant (similarity >= 0.5) combinations.
# NOTE(review): similarity_df already had 'Rauschen' removed above, so the
# != "Rauschen" filters here are redundant but harmless.
print(f'Anzahl möglicher Kombinationen: {len(similarity_df[similarity_df["category"]!="Rauschen"])}')
print(f'Anzahl relevanter Kombinationen: {len(sim_df)}')
print(f'Anzahl Kombinationen pro Hashtag: {len(sim_df)/sim_df["hashtags"].nunique()}')
print(f'Anteil relevanter Kombinationen: {round(len(sim_df[sim_df["category"]!="Rauschen"])/len(similarity_df[similarity_df["category"]!="Rauschen"])*100,2)}%')
# Number of relevant cluster/hashtag combinations per category.
for category in sim_df['category'].unique():
    tmp = sim_df[sim_df['category']==category]
    print(f'Kategorie: {category}, Anzahl relevanter Kombinationen: {tmp.groupby(["cluster", "hashtags"], as_index=False).ngroups}')
# Mean similarity score per category.
sim_df.groupby('category', as_index=False)['similarity_scores'].mean()
# Load the full cluster assignment (one row per suggestion) and join categories.
cluster_cat_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/cluster.json')
cluster_cat_df = cluster_cat_df.merge(cluster_cat, how='left', on='cluster')
# Build a table of cluster sizes with category labels, noise clusters removed.
tmp = pd.DataFrame()
tmp['cluster'] = cluster_cat_df['cluster'].value_counts().index
tmp['Clustergröße'] = cluster_cat_df['cluster'].value_counts().values
tmp = tmp.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
tmp = tmp[tmp['category']!='Rauschen']
# Number of distinct clusters per category, largest first.
tmp2 = cluster_cat_df.groupby('category', as_index=False)['cluster'].nunique().sort_values(by='cluster', ascending=False)
tmp2.rename(columns={'cluster': 'n_cluster'}, inplace=True)
tmp = tmp.merge(tmp2, on='category')
tmp.rename(columns={'category':'Kategorie', 'cluster':'Cluster', 'n_cluster':'Anzahl Cluster'}, inplace=True)
# Mean cluster size and cluster count per category.
tmp.groupby('Kategorie', as_index=False).mean()
# t-SNE scatter of all suggestions, coloured by cluster category.
fig = px.scatter(
    cluster_cat_df,
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='category',
    hover_name='suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# --- regression analysis: does similarity depend on size, party, gender, category? ---
reg_df = suggestions_df.groupby(['party', 'gender', 'cluster'], as_index=False).sum()
tmp = similarity_df.groupby(['cluster', 'category'], as_index=False).mean()
reg_df = reg_df.merge(tmp, how='left', on='cluster')
# Drop combinations without similarity data (e.g. clusters removed as noise above).
reg_df.dropna(inplace=True)
reg_df = reg_df.reset_index(drop=True)
reg_df.head(3)
# OLS: mean similarity score explained by cluster size plus the categorical dimensions.
reg = smf.ols('similarity_scores ~ size + C(party) + C(gender) + C(category)', data=reg_df).fit()
reg.summary()
Fragestellung: Wie lange dauert die Durchdringung im Durchschnitt und nach den jeweiligen Dimensionen? Messung: TLCC mit Pearson R und p-Wert.
# Overview: mean correlation and combined p-value per time lag.
delay_list = []
r_list = []
p_list = []
for i in range(len(dfs)):
    # NOTE(review): pairs dfs[i] with delays[i] — this assumes the delay files
    # were loaded in ascending-delay order; glob order is filesystem-dependent,
    # verify the pairing in the loading cell above.
    delay_list.append(int(delays[i]/7))
    # Overall view only: neither split by gender nor by party.
    df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
    r_list.append(round(df['pearsonr'].mean(),3))
    # Combine the per-combination p-values (Fisher's method by default).
    p_values = df['p_value'].to_numpy()
    p_list.append(round(stats.combine_pvalues(p_values)[1],3))
tmp = pd.DataFrame(data={'Time Lag (in Wochen)': delay_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp
Sämtliche Korrelationen sind signifikant (p<0.05), deshalb Betrachtung im Plot.
# Mean correlation per time lag as a line plot.
fig = px.line(tmp, x='Time Lag (in Wochen)', y='Pearson R',
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
Plateau zwischen 6 und 9 Wochen, allerdings sehr geringe Korrelation. Ausnahmen bei 9 Wochen sind nur wenige:
# Inspect the few strong correlations (r >= 0.5) at the 9-week lag.
# NOTE(review): dfs[9] assumes ascending-delay ordering of dfs
# (index 9 -> 63 days = 9 weeks); see the loading cell above.
tmp = dfs[9][(dfs[9]['gender']=='all')&(dfs[9]['party']=='all')]
tmp = tmp[tmp['pearsonr']>=0.5]
tmp.sort_values(by='pearsonr', ascending=False)[['cluster', 'hashtags', 'category_x', 'pearsonr', 'similarity_scores']]
# Mean correlation and combined p-value per time lag and category.
delay_list = []
categories = []
r_list = []
p_list = []
for i in range(len(dfs)):
    for category in set(similarity_df['category']):
        # NOTE(review): delays[i]/dfs[i] pairing assumes ascending-delay load order.
        delay_list.append(delays[i])
        # Overall view (no gender/party split); category filter applied below.
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
        categories.append(category)
        r_list.append(round(df[df['category_x']==category]['pearsonr'].mean(),3))
        p_values = df[df['category_x']==category]['p_value'].to_numpy()
        p_list.append(round(stats.combine_pvalues(p_values)[1], 3))
plot_cat = pd.DataFrame(data={'Delay': delay_list, 'Kategorie': categories, 'Pearson R': r_list, 'P-Wert': p_list})
plot_cat = plot_cat.dropna()
plot_cat = plot_cat.reset_index(drop=True)
# Example: inspect the economy category.
plot_cat[plot_cat['Kategorie']=='Wirtschaft']
# Mean correlation and combined p-value per time lag and gender.
delay_list = []
gender_list = []
r_list = []
p_list = []
for i in range(len(dfs)):
    for gender in set(suggestions_df['gender']):
        # NOTE(review): delays[i]/dfs[i] pairing assumes ascending-delay load order.
        delay_list.append(delays[i])
        # Gender-split view only (party aggregated).
        df = dfs[i][(dfs[i]['gender']!='all')&(dfs[i]['party']=='all')]
        gender_list.append(gender)
        # Exclude the noise category.
        df = df[df['category_x']!='Rauschen']
        r_list.append(round(df[df['gender']==gender]['pearsonr'].mean(),3))
        p_values = df[df['gender']==gender]['p_value'].to_numpy()
        p_list.append(round(stats.combine_pvalues(p_values)[1],3))
plot_gender = pd.DataFrame(data={'Delay': delay_list, 'Geschlecht': gender_list, 'Pearson R': r_list, 'P-Wert': p_list})
plot_gender = plot_gender.dropna()
plot_gender[plot_gender['Geschlecht']=='male']
# Party colours for the per-party plot below.
# NOTE(review): px.line assigns colours in trace order, so this list must match
# the sorted party order produced by plot_party — the printout after the plot
# exists to verify exactly that.
party_colors = ['rgb(0,158,224)', #afd
                'rgb(50,48,46)', #cdu
                'rgb(0,128,201)', #csu
                'rgb(182,28,62)', #dielinke
                'rgb(255,237,0)', #fdp
                'rgb(70,150,43)', #grüne
                'rgb(203,166,115)', #parteilos
                'rgb(227,0,15)', #spd
                'rgb(173,185,202)'# fraktionslos
                ]
# Mean correlation and combined p-value per time lag and party.
delay_list = []
party_list = []
r_list = []
p_list = []
for i in range(len(dfs)):
    for party in set(suggestions_df['party']):
        # NOTE(review): delays[i]/dfs[i] pairing assumes ascending-delay load order.
        delay_list.append(delays[i])
        # Party-split view only (gender aggregated).
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']!='all')]
        party_list.append(party)
        # Exclude the noise category.
        df = df[df['category_x']!='Rauschen']
        r_list.append(round(df[df['party']==party]['pearsonr'].mean(),3))
        p_values = df[df['party']==party]['p_value'].to_numpy()
        p_list.append(round(stats.combine_pvalues(p_values)[1],3))
plot_party = pd.DataFrame(data={'Delay': delay_list, 'Partei': party_list, 'Pearson R': r_list, 'P-Wert': p_list})
plot_party = plot_party.dropna()
# Keep only significant combined p-values.
plot_party = plot_party[plot_party['P-Wert']<0.05]
plot_party['Time Lag (in Wochen)'] = plot_party['Delay'] / 7
plot_party = plot_party.sort_values(by=['Partei', 'Delay'], ascending=True)
fig = px.line(plot_party, x='Time Lag (in Wochen)', y='Pearson R', color='Partei',
              template='simple_white', color_discrete_sequence=party_colors)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
# Print the remaining parties to check they line up with the party_colors order.
for x in plot_party['Partei'].unique():
    print(x)
# Load the detected peak dates per hashtag.
peaks_df = pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
# NOTE(review): num_peaks = len(lda_dates) / 7 — presumably the dates come in
# 7-day windows per peak; confirm. The column is dropped again below anyway.
peaks_df['num_peaks'] = peaks_df.apply(lambda x: len(x['lda_dates']) / 7, axis=1)
# peak_ranges (project helper from ../scripts/analysis) turns the date lists
# into (peak_start, peak_end) pairs.
peaks_df[['peak_start', 'peak_end']] = peaks_df.apply(peak_ranges, axis=1)
peaks_df.drop(columns=['index', 'num_peaks', 'lda_dates'], inplace=True)
# Explode to one row per hashtag and peak range.
peaks_df = peaks_df.set_index(['hashtag']).apply(pd.Series.explode).reset_index()
peaks_df.head(3)
# Daily suggestion counts per cluster, party and gender, with category labels.
cluster_ts_df = suggestions_df.groupby(['date', 'cluster', 'party', 'gender'], as_index=False).sum('count')
cluster_ts_df = cluster_ts_df.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
cluster_ts_df.head(3)
# One-off cache generation: uncomment to recompute the peak analysis per test
# range and write the results to disk; afterwards they are re-loaded from JSON.
#analysis_dfs = []
#
#for i in tqdm(range(len(delays[1:]))):
#    test_range = delays[i+1]
#    tmp = peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df)
#    tmp = tmp.reset_index(drop=True)
#    analysis_dfs.append(tmp)
# save files
#for i in range(len(analysis_dfs)):
#    analysis_dfs[i].to_json(f'../../data/Analysis/peak_analysis_detail_range_{delays[i]}.json')
# set to *.json to load all
input_loc = '../../data/Analysis/peak_analysis*.json'
input_files = glob.glob(input_loc)
analysis_dfs = []
# Load order does not matter here: the test range is read from the data itself below.
for file in input_files:
    data = pd.read_json(file)
    analysis_dfs.append(data)
# Welch's t-tests (equal_var=False) comparing suggestion counts after vs.
# before the peaks, for the 1-, 5- and 9-week test ranges.
for i in range(len(analysis_dfs)):
    tmp = analysis_dfs[i]
    # test_range is constant within each file, so mean() recovers its value.
    if tmp['test_range'].mean() in [7,35,63]:
        test_range = tmp['test_range'].unique()
        a = tmp[tmp['time']=='after']['count']
        b = tmp[tmp['time']=='before']['count']
        results = stats.ttest_ind(a,b, equal_var=False)
        # Group means before/after, then the test statistic and p-value.
        print(tmp.groupby('time', as_index=False).mean()[['time', 'count']])
        print(f'Test Range: {test_range}, t: {results[0]}, p: {results[1]}\n')
# Aggregate the peak-analysis results for the 1-, 5- and 9-week test ranges
# into one frame with explicit before/after columns per combination.
df = pd.DataFrame()
for tmp in analysis_dfs:
    # test_range is constant per file; mean() recovers it.
    if tmp['test_range'].mean() in [7,35,63]:
        df = pd.concat([df, tmp])
# Use .copy() so drop/rename operate on real frames instead of slices of df:
# the original relied on the globally suppressed SettingWithCopyWarning and
# would break under pandas copy-on-write.
before = df[df['time']=='before'].copy()
before.drop(columns='time', inplace=True)
before.rename(columns={'count':'before'}, inplace=True)
after = df[df['time']=='after'].copy()
after.drop(columns='time', inplace=True)
after.rename(columns={'count':'after'}, inplace=True)
reg_df = before.merge(after, on=['cluster', 'party', 'gender', 'category', 'hashtag', 'peak', 'test_range'])
# Relative change in suggestion volume after vs. before the peak.
# NOTE(review): rows with before == 0 yield inf — confirm they cannot occur,
# or filter them before the regressions below.
reg_df['diff'] = (reg_df['after'] - reg_df['before']) / reg_df['before']
# Volatility per (cluster, party, gender): std of the daily counts,
# min-max normalised as a control variable for the regressions.
std_df = suggestions_df.groupby(['cluster', 'party', 'gender'], as_index=False).std()
std_df['norm_std'] = (std_df['count'] - std_df['count'].min()) / (std_df['count'].max() - std_df['count'].min())
std_df.rename(columns={'count':'std'}, inplace=True)
std_df.head(3)
reg_df = reg_df.merge(std_df, on=['cluster', 'party', 'gender'], how='left')
reg_df.head(3)
# OLS regressions of the relative change ('diff'), one per test range.
# NOTE(review): C(test_range) is constant within each filtered subset, so the
# term is degenerate here — consider dropping it from the formulas.
# regression for test range 1 week
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==7]).fit()
reg.summary()
# regression for test range 5 weeks
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==35]).fit()
reg.summary()
# regression for test range 9 weeks
reg = smf.ols('diff ~ C(party) + C(gender) + C(category) + C(test_range) + norm_std',
              data=reg_df[reg_df['test_range']==63]).fit()
reg.summary()